# Load the dataset
scrna.data <- Read10X(data.dir = here::here(
'..','..','scrnaseq_tcsl154',
'cellranger','outs','raw_feature_bc_matrix'))
#rename some adt features
hto_rows <- paste('Hashtag',1:12,sep='_')
adt_feature_names <- fread(here::here(
'..','..','scrnaseq_tcsl154',
'meta','adt_names.tsv'))
rownames(scrna.data$`Antibody Capture`) <- c(paste0(adt_feature_names$vis_name,'.adt'), hto_rows)
#rna features
scrna <- CreateSeuratObject(
counts = scrna.data$`Gene Expression`,
project = "tcsl154",
min.cells = 3,
min.features = 200)
adt_rows <- setdiff(rownames(scrna.data$`Antibody Capture`), hto_rows)
#adt features
scrna[['ADT']] <- CreateAssayObject(
counts = scrna.data$`Antibody Capture`[adt_rows, colnames(scrna)])
scrna[["percent.mt"]] <- PercentageFeatureSet(scrna, pattern = "^MT-")
VlnPlot(scrna,
pt.size=F,
features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
plot1 <- FeatureScatter(
scrna, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(
scrna, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2
scrna <- subset(
scrna,
subset = nFeature_RNA > 800 & nFeature_RNA < 7000 & percent.mt < 15)
source(here::here('r','scrna','HTOdemux.R'))
scrna[['HTO']] <- CreateAssayObject(
counts = scrna.data$`Antibody Capture`[hto_rows, colnames(scrna)])
#all data loaded, clear raw data from memory and do garbage collect
#rm('scrna.data')
gc()
scrna <- NormalizeData(scrna, assay = "HTO", normalization.method = "CLR")
scrna <- DoubleHTODemux(scrna, positive.quantile = 0.90, ncenters=25)
ggplot(
data.table(scrna@meta.data)[, .N,
by=.(HTO_classification, HTO_clusters, hash.ID)],
aes(y=HTO_classification, x=hash.ID, fill=log10(N))) +
geom_tile() + facet_wrap(~HTO_clusters, scales='free') +
labs(title='Cluster vs HTO assignment')
# assign match for best cluster
cluster.hash.ID <- data.table(scrna@meta.data)[,
.N, by=.(HTO_classification, HTO_clusters, hash.ID)][,
list(cluster.hash.ID= hash.ID[hash.ID != 'Triplets'][
which.max(N[hash.ID != 'Triplets'])]), by=.(HTO_clusters)]
updated_metadata <- cluster.hash.ID[
data.table(scrna@meta.data),
on='HTO_clusters', nomatch=NA, all=T][
is.na(cluster.hash.ID), cluster.hash.ID := 'Triplet']
updated_metadata[, HTO_best_doublet := gsub('\\+\\d+','',HTO_classification)]
updated_metadata[, HTO_miscluster := (
as.character(cluster.hash.ID) != as.character(HTO_best_doublet))]
# margin to throw away clustering points if third-closest HTO's
# signal strength relative to second is beyond this
HTO_margin_cutoff <- -1
ggplot(updated_metadata[
HTO_miscluster == F &
(as.character(hash.ID) != as.character(cluster.hash.ID))]) +
geom_jitter(aes(x=HTO_margin, y=HTO_classification), alpha=0.2) +
facet_wrap(~cluster.hash.ID, scales='free') +
geom_vline(aes(xintercept=-1))
# mark cells with HTO margin above cutoff as misclustered
updated_metadata[
(as.character(hash.ID) != as.character(cluster.hash.ID)) &
HTO_margin > HTO_margin_cutoff,
HTO_miscluster := T]
# assign cluster hashes if not misclustered
updated_metadata[HTO_miscluster == F, hash.ID := cluster.hash.ID]
# merge with metadata file, add to HTO scrna metadata
HTO_sample_metadata <- fread(here::here(
'..','..','scrnaseq_tcsl154','meta','sample_metadata.csv'))
names(HTO_sample_metadata)[2] <- 'hash.ID'
updated_metadata <- HTO_sample_metadata[updated_metadata, on='hash.ID']
#if hash combo is not a sample, label negative
updated_metadata[
is.na(sample_num) & !(hash.ID %in% c('Triplets', 'Singlets', 'Negative')),
hash.ID := 'Negative']
# update metadata in object
updated_metadata <- as.data.frame(updated_metadata)
rownames(updated_metadata) <- colnames(scrna)
scrna@meta.data <- updated_metadata
#assign new cell identities
Idents(object = scrna) <- scrna@meta.data$hash.ID
#make a subset for tsne plot
hto_subset <- subset(scrna, cells=sample(Cells(scrna), 10000))
hto.dist.mtx <- as.matrix(parDist(t(
GetAssayData(object = hto_subset, assay = "HTO"))))
hto_subset <- RunTSNE(hto_subset,
distance.matrix = hto.dist.mtx,
perplexity = 100)
DimPlot(hto_subset, label=T)
DimPlot(hto_subset, split='hash.ID',ncol=4)
scrna.hashed <- subset(scrna, idents = HTO_sample_metadata$hash.ID)
DefaultAssay(scrna.hashed) <- "RNA"
scrna.hashed <- NormalizeData(scrna.hashed,
assay='RNA',
normalization.method = "LogNormalize", scale.factor = 10000)
scrna.hashed <- ScaleData(
scrna.hashed, assay='RNA',
features= VariableFeatures(scrna.hashed))
scrna.hashed <- FindVariableFeatures(scrna.hashed,
selection.method = "vst",
nfeatures = 2000)
saveRDS(scrna.hashed, file = here::here(
'..','..','scrnaseq_tcsl154',
'rds','scrna.hashed.rds'))
First, get adt tags that separate CD4s and CD8s, and add rnas as controls.
if (!('scrna.hashed' %in% ls()))
scrna.hashed <- readRDS(here::here(
'..','..','scrnaseq_tcsl154',
'rds','scrna.hashed.rds'))
DefaultAssay(scrna.hashed) <- "ADT"
scrna.hashed <- NormalizeData(scrna.hashed, assay = "ADT", normalization.method = "CLR")
## Normalizing across features
scrna.hashed <- ScaleData(scrna.hashed, assay = "ADT")
## Centering and scaling data matrix
scrna.hashed <- FindVariableFeatures(scrna.hashed)
adt_features <- scrna.hashed@assays$ADT@var.features
cd4v8_subset_vars <- c(adt_features, 'rna_CD8A','rna_CD8B','rna_CD4')
cd4v8.dt <- data.table(FetchData(
object = scrna.hashed,
vars = cd4v8_subset_vars))
combined_pca <- prcomp(cd4v8.dt)
# calculate pca stats
pca.dt <- data.table(pc = data.table(colnames(combined_pca$rotation))[,
PC := as.integer(gsub("[A-Z]", "", V1))][, PC],
sd = combined_pca$sdev,
var = combined_pca$sdev^2,
var.norm = combined_pca$sdev^2/sum(combined_pca$sdev^2),
var.acc = cumsum(combined_pca$sdev^2/sum(combined_pca$sdev^2)))
melt.pca.dt <- melt(
pca.dt, measure.vars = c("var.norm", "var.acc"), variable.name = "metric")
pca.stats.plot <- ggplot(melt.pca.dt) +
geom_line(aes(x = pc, y = value, color = metric)) +
geom_point(aes(x = pc, y = value, color = metric)) +
scale_x_continuous(limits = c(1, NA)) +
labs(title = "Fraction of Variance Captured by Principal Components, Every Sort Group",
x = "Principal Component", y = "Fraction of Variance") +
theme_bw()
pca.stats.plot
# project data onto principal components
projected.metrics <- scale(cd4v8.dt, combined_pca$center, combined_pca$scale) %*% combined_pca$rotation
#vectorized switch:
names_4v8 <- c('4-8-', '8+4-', '4+8-', '4+8+')
categories_4v8 <- c('CD8','intermediate','intermediate','CD4')
projected.metrics <- cbind.data.frame(cd4v8.dt, projected.metrics)
projected.metrics[, is_CD4_adt := CD4.adt > 1.2]
projected.metrics[, is_CD8_adt := CD8A.adt > 0.5]
projected.metrics[, adt_state := names_4v8[1+(2*is_CD4_adt + is_CD8_adt)]]
projected.metrics[, is_CD4_rna := rna_CD4 > 0]
projected.metrics[, is_CD8B_rna := rna_CD8B > 0]
projected.metrics[, is_CD8A_rna := rna_CD8A > 0]
projected.metrics[, rna_state := names_4v8[1+(2*is_CD4_rna + (is_CD8B_rna | is_CD8A_rna))]]
intercepts <- c(.89, 1.95)
slopes <- c(.77, 1.45)
projected.metrics[, cd8_classify := PC2 > (slopes[1]*PC1 + intercepts[1])]
projected.metrics[, cd4_classify := PC2 > (slopes[2]*PC1 + intercepts[2])]
projected.metrics[, cd_category := categories_4v8[1+(2*cd4_classify + cd8_classify)]]
plot_grid(
ggplot(projected.metrics[], aes(x = PC1, y = PC2, color=adt_state)) +
geom_point(alpha=0.1) +
geom_abline(intercept=intercepts, slope=slopes),
ggplot(projected.metrics[], aes(x = PC1, y = PC2, color=adt_state)) +
facet_wrap(~adt_state) +
geom_point(alpha=0.1) +
geom_abline(intercept=intercepts, slope=slopes),
ggplot(projected.metrics[], aes(x = PC1, y = PC2, color=adt_state)) +
facet_wrap(~rna_state) +
geom_point(alpha=0.1) +
geom_abline(intercept=intercepts, slope=slopes),
ggplot(projected.metrics[], aes(x = PC1, y = PC2, color=cd_category)) +
geom_point(alpha=0.1) +
geom_abline(intercept=intercepts, slope=slopes),
ncol=1,
labels=c('', 'Grouped by presence of CD4/CD8 ADT', 'Grouped by presence of CD4/CD8 RNA','Manual Classification'),
scale=0.9)
scrna.hashed@meta.data[['CD4v8']] <- projected.metrics$cd_category
if (reload_data) saveRDS(scrna.hashed, file = here::here(
'..','..','scrnaseq_tcsl154',
'rds','scrna.hashed.rds'))
if (!('scrna.hashed' %in% ls()))
scrna.hashed <- readRDS(here::here(
'..','..','scrnaseq_tcsl154',
'rds','scrna.hashed.rds'))
umap_plot <- function(subset, plot_title_text="", assay, cluster_resolution) {
subset.markers <- FindAllMarkers(subset,
features = VariableFeatures(subset),
min.pct = 0.5, logfc.threshold = 0.25)
top.subset.markers <- subset.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
top_genes_plot <- ggplot(top.subset.markers) +
geom_bar(aes(y=avg_logFC, x=reorder(gene, avg_logFC)), stat='identity') +
facet_wrap(~cluster, scales='free', ncol=3) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
cluster_col_name <- paste0(assay,'_snn_res.',as.character(cluster_resolution))
cluster_pct_plot <- ggplot(
data.table(subset@meta.data)[
CD4v8 != 'intermediate', .N,
by=c('car', cluster_col_name, 'CD4v8', 'k_type')][,
list(cluster=get(cluster_col_name), t_type=CD4v8, frac=N/sum(N)),
by=c('car', 'CD4v8', 'k_type')]) +
geom_bar(aes(x=car, y=frac, fill=frac), stat='identity', color='grey50') +
scale_fill_distiller(palette='YlGnBu', direction=1) +
facet_grid(k_type+t_type~cluster) +
theme_bw() +
coord_flip()
plot_title <- ggdraw() +
draw_label(
plot_title_text,
fontface = 'bold',
x = 0,
hjust = 0
) +
theme(
# add margin on the left of the drawing canvas,
# so title is aligned with left edge of first plot
plot.margin = margin(0, 0, 0, 7)
)
gene_heatmap <- DoHeatmap(subset,
features = unique(
(top.subset.markers %>% group_by(cluster)
%>% top_n(n=5, wt= avg_logFC))$gene))
umap_plots <- plot_grid(
plot_grid(
plot_title,
DimPlot(subset, reduction = "umap", label=T),
cluster_pct_plot,
rel_heights=c(0.2,1.25,1),
ncol=1),
gene_heatmap,
ncol=2)
return(umap_plots)
}
umap_subset <- function(this_subset,
plot_title_text="", assay='RNA', cluster_resolution=0.8, save=T, rerun=F) {
DefaultAssay(this_subset) <- assay
this_subset <- subset(this_subset,
features=rownames(this_subset@assays[[assay]]))
rds_file <- here::here(
'..','..','scrnaseq_tcsl154',
'rds',
paste0(
'scrna.hashed.',
gsub(' ','.',plot_title_text),
'.rds'))
if (rerun || !file.exists(rds_file)) {
if (assay == 'RNA') {
this_subset <- NormalizeData(
this_subset,
normalization.method = "LogNormalize",
scale.factor = 10000)
} else if (assay == 'ADT') {
this_subset <- NormalizeData(
this_subset,
normalization.method = "CLR")
}
this_subset <- FindVariableFeatures(
this_subset,
selection.method = "vst",
nfeatures = 2000)
this_subset <- ScaleData(
this_subset, assay=assay,
features= VariableFeatures(this_subset))
this_subset <- RunPCA(
this_subset, features = VariableFeatures(this_subset))
this_subset <- FindNeighbors(this_subset, reduction = "pca", dims = 1:8)
this_subset <- FindClusters(this_subset, resolution = cluster_resolution, algorithm=1)
this_subset <- RunUMAP(this_subset, dims = 1:8)
this_subset@meta.data[['car.ttype.ktype']] <- paste(
this_subset@meta.data[['car']],
this_subset@meta.data[['CD4v8']],
this_subset@meta.data[['k_type']], sep='_')
} else {
this_subset <- readRDS(rds_file)
}
subset_umap_plot <- umap_plot(this_subset, plot_title_text, assay, cluster_resolution)
if ((rerun | (!file.exists(rds_file))) & save)
saveRDS(this_subset, file = rds_file)
return(list(plots=subset_umap_plot, rds_file=rds_file))
}
subset_fxns= list(
function(obj) obj,
function(obj) subset(obj, subset = (CD4v8 == 'CD4')),
function(obj) subset(obj, subset = (CD4v8 == 'CD8')),
function(obj) subset(obj, subset = (k_type == 'cd19+')),
function(obj) subset(obj, subset = (k_type == 'cd19+' & CD4v8 == 'CD4')),
function(obj) subset(obj, subset = (k_type == 'cd19+' & CD4v8 == 'CD8')))
subset_names= c('all cells','CD4','CD8','CD19','CD4 CD19','CD8 CD19')
umap_plots <- data.table(subset_name= subset_names, subset_fxn_num=1:6)
umap_plots[, ADT := list(list(
umap_subset(subset_fxns[[subset_fxn_num]](scrna.hashed), plot_title_text=paste0(.BY[1],' ADT'), assay='ADT'))),
by='subset_name']
umap_plots[, RNA := list(list(
umap_subset(subset_fxns[[subset_fxn_num]](scrna.hashed), plot_title_text=paste0(.BY[1],' RNA'), assay='RNA'))),
by='subset_name']
saveRDS(umap_plots, file = here::here(
'..','..','scrnaseq_tcsl154',
'rds','scrna.umap.plots.rds'))
if (!('umap_plots' %in% ls()))
umap_plots <- readRDS(here::here(
'..','..','scrnaseq_tcsl154',
'rds','scrna.umap.plots.rds'))
umap_plots[, print(ADT[[1]]), by='subset_name']
## $plots
##
## $rds_file
## [1] "/home/ec2-user/local-tcsl/src/../../scrnaseq_tcsl154/rds/scrna.hashed.all.cells.ADT.rds"
##
## $plots
##
## $rds_file
## [1] "/home/ec2-user/local-tcsl/src/../../scrnaseq_tcsl154/rds/scrna.hashed.CD4.ADT.rds"
##
## $plots
##
## $rds_file
## [1] "/home/ec2-user/local-tcsl/src/../../scrnaseq_tcsl154/rds/scrna.hashed.CD8.ADT.rds"
##
## $plots
##
## $rds_file
## [1] "/home/ec2-user/local-tcsl/src/../../scrnaseq_tcsl154/rds/scrna.hashed.CD19.ADT.rds"
##
## $plots
##
## $rds_file
## [1] "/home/ec2-user/local-tcsl/src/../../scrnaseq_tcsl154/rds/scrna.hashed.CD4.CD19.ADT.rds"
##
## $plots
##
## $rds_file
## [1] "/home/ec2-user/local-tcsl/src/../../scrnaseq_tcsl154/rds/scrna.hashed.CD8.CD19.ADT.rds"
if (!('scrna.hashed' %in% ls()))
scrna.hashed <- readRDS(here::here(
'..','..','scrnaseq_tcsl154',
'rds','scrna.hashed.rds'))
umap_plots[, print(RNA[[1]]), by='subset_name']
## $plots
##
## $rds_file
## [1] "/home/ec2-user/local-tcsl/src/../../scrnaseq_tcsl154/rds/scrna.hashed.all.cells.RNA.rds"
##
## $plots
##
## $rds_file
## [1] "/home/ec2-user/local-tcsl/src/../../scrnaseq_tcsl154/rds/scrna.hashed.CD4.RNA.rds"
##
## $plots
##
## $rds_file
## [1] "/home/ec2-user/local-tcsl/src/../../scrnaseq_tcsl154/rds/scrna.hashed.CD8.RNA.rds"
##
## $plots
##
## $rds_file
## [1] "/home/ec2-user/local-tcsl/src/../../scrnaseq_tcsl154/rds/scrna.hashed.CD19.RNA.rds"
##
## $plots
##
## $rds_file
## [1] "/home/ec2-user/local-tcsl/src/../../scrnaseq_tcsl154/rds/scrna.hashed.CD4.CD19.RNA.rds"
##
## $plots
##
## $rds_file
## [1] "/home/ec2-user/local-tcsl/src/../../scrnaseq_tcsl154/rds/scrna.hashed.CD8.CD19.RNA.rds"